The dataset contains 206 attributes of 70 children with physical and motor disability based on ICF-CY.
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
sns.set()
import plotly.express as px
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet,ElasticNetCV
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from sklearn.model_selection import KFold,StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
# Load the SCADI dataset (70 children, 206 ICF-CY based attributes) from the working directory.
sc = pd.read_csv('SCADI.csv')
# Report the overall dimensions: rows = records (children), columns = features.
print(f"Shape of the Data:{sc.shape}\nNumber of Records:{sc.shape[0]}\nNumber of features:{sc.shape[1]}")
# Preview the first five records to sanity-check the load.
sc.head()
Shape of the Data:(70, 206) Number of Records:70 Number of features:206
| Gender | Age | d 5100-0 | d 5100-1 | d 5100-2 | d 5100-3 | d 5100-4 | d 5100-8 | d 5100-9 | d 5101-0 | ... | d 57022-8 | d 57022-9 | d 571-0 | d 571-1 | d 571-2 | d 571-3 | d 571-4 | d 571-8 | d 571-9 | Classes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 18 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | class6 |
| 1 | 0 | 22 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | class6 |
| 2 | 0 | 18 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | class6 |
| 3 | 1 | 18 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | class6 |
| 4 | 0 | 19 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | class6 |
5 rows × 206 columns
# Checking the columns exclude attributes 1, 2 and 206
sc.columns[2:-1]
Index(['d 5100-0', 'd 5100-1', 'd 5100-2', 'd 5100-3', 'd 5100-4', 'd 5100-8',
'd 5100-9', 'd 5101-0', 'd 5101-1', 'd 5101-2',
...
'd 57022-4', 'd 57022-8', 'd 57022-9', 'd 571-0', 'd 571-1', 'd 571-2',
'd 571-3', 'd 571-4', 'd 571-8', 'd 571-9'],
dtype='object', length=203)
len(sc.columns[2:-1])
203
print(f"SHape of the Dataset:{sc.shape}")
SHape of the Dataset:(70, 206)
print(f"Unique labels in classes column: {sc['Classes'].unique()}")
print(f"Unique number of labels in classes column: {sc['Classes'].nunique()}")
Unique labels in classes column: ['class6' 'class2' 'class4' 'class7' 'class1' 'class5' 'class3'] Unique number of labels in classes column: 7
X = sc.drop(columns = ['Gender','Age','Classes'])
X.shape
(70, 203)
# Standardise each feature to zero mean / unit variance so that the Euclidean
# distances used by KMeans are not dominated by any single column.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
# Inspect the scaled matrix (fit_transform returns a NumPy array).
scaled_X
array([[-0.2773501 , -0.40824829, -0.61036794, ..., 3.26598632,
0. , 0. ],
[-0.2773501 , -0.40824829, -0.61036794, ..., -0.30618622,
0. , 0. ],
[-0.2773501 , -0.40824829, -0.61036794, ..., -0.30618622,
0. , 0. ],
...,
[-0.2773501 , -0.40824829, -0.61036794, ..., -0.30618622,
0. , 0. ],
[-0.2773501 , -0.40824829, -0.61036794, ..., -0.30618622,
0. , 0. ],
[-0.2773501 , -0.40824829, -0.61036794, ..., -0.30618622,
0. , 0. ]])
# Elbow method: fit KMeans for every k in 2..9 and record each fit's
# inertia (within-cluster sum of squares) so we can look for an "elbow".
wcss = []
values = range(2,10)
li = []
for n in values:
    model = KMeans(n_clusters=n, random_state=42)
    model.fit(scaled_X)
    wcss.append(model.inertia_)
    li.append(f"For n_clusters = {n},inertia :{model.inertia_}")

# Distortion vs. cluster count; a sharp bend marks a candidate k.
fig = px.line(
    x=values,
    y=wcss,
    title='Distortions vs number of clusters',
    labels={"x": "Number_of_clusters", "y": "Distortions"},
    height=500,
    width=900,
    markers=True,
)
fig.show()
li
['For n_clusters = 2,inertia :8067.6498830395285', 'For n_clusters = 3,inertia :6944.268517928714', 'For n_clusters = 4,inertia :6091.816110772894', 'For n_clusters = 5,inertia :5670.663840404521', 'For n_clusters = 6,inertia :5271.766858895061', 'For n_clusters = 7,inertia :4880.495550659746', 'For n_clusters = 8,inertia :4584.945327728501', 'For n_clusters = 9,inertia :4464.50887460046']
From the above graph and list, the elbow appears at cluster count 4 (mean distortion = 6091.81),
while cluster count 7 has a mean distortion of 4880.49.
With this in mind we could choose either 4 or 7 clusters: there is a significant decrease in distortion from 3 to 4 clusters but not much decrease from 4 to 5.
Similarly, from 6 to 7 clusters there is a significant decrease in distortion, but not much from 7 to 8.
So the candidates are 4 or 7 clusters. To decide between them, we compute the silhouette score; the cluster count with the maximum score is taken as the final choice.
# Silhouette analysis over the same k range: a higher score means
# better-separated, more cohesive clusters.
sl = []
for n in values:
    clusterer = KMeans(n_clusters=n, random_state=42)
    assignments = clusterer.fit_predict(scaled_X)
    score = silhouette_score(scaled_X, assignments, random_state=1)
    sl.append(score)
    print(f"For n_clusters = {n},silhouette score :{score}")
For n_clusters = 2,silhouette score :0.1663380133050078 For n_clusters = 3,silhouette score :0.19336960896484867 For n_clusters = 4,silhouette score :0.21962896326983072 For n_clusters = 5,silhouette score :0.2010513972240522 For n_clusters = 6,silhouette score :0.21375517781568876 For n_clusters = 7,silhouette score :0.2300238873038162 For n_clusters = 8,silhouette score :0.20313742323883902 For n_clusters = 9,silhouette score :0.17900613645430866
# Plot the silhouette scores; annotating each point with its k makes the
# peaks easy to read off.
fig = px.line(
    x=values,
    y=sl,
    title='silhouette score',
    labels={"x": "number_of_clusters", "y": "silhouette score"},
    height=500,
    width=1000,
    text=values,
)
fig.show()
From the above graph, we can see two peaks, at cluster counts 4 and 7 respectively.
Cluster count 7 has a score of 0.23.
Cluster count 4 has a score of 0.21.
This shows that we are more inclined to choose 7 clusters (7 classes) for our problem —
and the problem statement for this dataset specifies 7 classes itself.
Final Interpretation:--
Initially we have been provided with a dataset that contains 206 attributes of 70 children with physical and motor disability based on ICF-CY.
This data is segmented into 7 different classes based on 206 features and only 70 records
In the modelling technique used, our aim is to divide the children with given features to subgroup them into various classes/segments and able to check the result that is already given in the dataset i.e. the number of 7 classes or not using clustering techniques
Here, K-means clustering is used for modelling, and we look for the optimal number of classes/groups/segments defined by the K-means algorithm.
Further, to determine the optimal number of clusters, the elbow and silhouette score methods are used.
From the above, the K-means algorithm produces the same number of clusters (7) as the number of classes given in the problem.
print(f"Shape of the dataset:{sc.shape}")
print(f"Number of Records in the dataset:{sc.shape[0]}")
print(f"Number of Features in the dataset:{sc.shape[1]}")
sc.head(2)
Shape of the dataset:(70, 206) Number of Records in the dataset:70 Number of Features in the dataset:206
| Gender | Age | d 5100-0 | d 5100-1 | d 5100-2 | d 5100-3 | d 5100-4 | d 5100-8 | d 5100-9 | d 5101-0 | ... | d 57022-8 | d 57022-9 | d 571-0 | d 571-1 | d 571-2 | d 571-3 | d 571-4 | d 571-8 | d 571-9 | Classes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 18 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | class6 |
| 1 | 0 | 22 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | class6 |
2 rows × 206 columns
Yes, the data is facing the curse of dimensionality, as the number of features (206) is far greater than the number of records (70).
Due to this Some of the difficulties that come with high dimensional data manifest during analyzing or visualizing the data to identify patterns, and some manifest while training machine learning models. The difficulties related to training machine learning models due to high dimensional data are basically known as the ‘Curse of Dimensionality’.
There are basically two facets of Curse of dimensionality :--
the available training samples may not have observed targets for all combinations of the attributes. This is because some combination occurs more often than others. Due to this, the training samples available for building the model may not capture all possible combinations. This aspect, where the training samples do not capture all combinations, is referred to as ‘Data sparsity’ or simply ‘sparsity’ in high dimensional data
Training a model with sparse data could lead to high-variance or overfitting conditions. This is because while training the model, the model has learnt from the frequently occurring combinations of the attributes and can predict the outcome accurately. In real-time when less frequently occurring combinations are fed to the model, it may not predict the outcome accurately.
Distance concentration refers to the problem of all the pairwise distances between different samples/points in the space converging to the same value as the dimensionality of the data increases. Several machine learning models such as clustering or nearest neighbours’ methods use distance-based metrics to identify similarities or proximity of the samples. Due to distance concentration, the concept of proximity or similarity of the samples may not be qualitatively relevant in higher dimensions.
To mitigate the problems associated with high dimensional data a suite of techniques generally referred to as ‘Dimensionality reduction techniques are used. Dimensionality reduction techniques fall into one of the two categories-
‘Feature selection’ or ‘Feature extraction
In feature selection techniques, the attributes are tested for their worthiness and then selected or eliminated.
In this technique, the variance in the distribution of all the attributes in a dataset is compared and attributes with very low variance are eliminated. Attributes that do not have such much variance will assume an almost constant value and do not contribute to the predictability of the model.
In this technique, the pair wise correlation between attributes is determined. One of the attributes in the pairs that show very high correlation is eliminated and the other retained. The variability in the eliminated attribute is captured through the retained attribute.
In some cases, the high correlation may not be found for pairs of attributes but if each attribute is regressed as a function of others, we may see that variability of some of the attributes are completely captured by the others. This aspect is referred to as multicollinearity and Variance Inflation Factor (VIF) is a popular technique used to detect multicollinearity. Attributes with high VIF values, generally greater than 10, are eliminated.
Decision Tree models such as CART can rank the attributes based on their importance or contribution to the predictability of the model. In high dimensional data, some of the lower ranked variables could be eliminated to reduce the dimensions
In feature extraction techniques, the high dimensional attributes are combined in low dimensional components (PCA or ICA) or factored into low dimensional factors (FA).
PCA is a dimensionality-reduction technique in which high dimensional correlated data is transformed to a lower dimensional set of uncorrelated components, referred to as principal components. The lower dimensional principal components capture most of the information in the high dimensional dataset. An 'n' dimensional dataset is transformed into 'n' principal components, and a subset of these is selected based on the percentage of variance in the data that is intended to be captured by the principal components.
In order to solve the problem, we apply various above techniques to reduce the dimensionality of the dataset
print(f"Shape of the dataset:{sc.shape}")
print(f"Number of Records in the dataset:{sc.shape[0]}")
print(f"Number of Features in the dataset:{sc.shape[1]}")
Shape of the dataset:(70, 206) Number of Records in the dataset:70 Number of Features in the dataset:206
Above data contains more features = 206 and very less records = 70 compared to features
X.head()
| d 5100-0 | d 5100-1 | d 5100-2 | d 5100-3 | d 5100-4 | d 5100-8 | d 5100-9 | d 5101-0 | d 5101-1 | d 5101-2 | ... | d 57022-4 | d 57022-8 | d 57022-9 | d 571-0 | d 571-1 | d 571-2 | d 571-3 | d 571-4 | d 571-8 | d 571-9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
5 rows × 203 columns
The data above will be checked for sparsity, constant-value (low variance) features, and correlated features in order to reduce the dimensionality.
For this we use the feature_engine library and create a pipeline to drop constant features, correlated features, duplicated features, etc., reducing the dimensionality of the dataset.
# Inspect sparsity: the ICF-CY indicator columns are dominated by zeros,
# and several columns are entirely constant (all 70 values equal).
for feature in X:
    print(X[feature].value_counts())
0 65 1 5 Name: d 5100-0, dtype: int64 0 60 1 10 Name: d 5100-1, dtype: int64 0 51 1 19 Name: d 5100-2, dtype: int64 0 58 1 12 Name: d 5100-3, dtype: int64 0 46 1 24 Name: d 5100-4, dtype: int64 0 70 Name: d 5100-8, dtype: int64 0 70 Name: d 5100-9, dtype: int64 0 70 Name: d 5101-0, dtype: int64 0 63 1 7 Name: d 5101-1, dtype: int64 0 52 1 18 Name: d 5101-2, dtype: int64 0 54 1 16 Name: d 5101-3, dtype: int64 0 41 1 29 Name: d 5101-4, dtype: int64 0 70 Name: d 5101-8, dtype: int64 0 70 Name: d 5101-9, dtype: int64 0 66 1 4 Name: d 5102-0, dtype: int64 0 65 1 5 Name: d 5102-1, dtype: int64 0 53 1 17 Name: d 5102-2, dtype: int64 0 54 1 16 Name: d 5102-3, dtype: int64 0 42 1 28 Name: d 5102-4, dtype: int64 0 70 Name: d 5102-8, dtype: int64 0 70 Name: d 5102-9, dtype: int64 0 66 1 4 Name: d 5200-0, dtype: int64 0 63 1 7 Name: d 5200-1, dtype: int64 0 63 1 7 Name: d 5200-2, dtype: int64 0 39 1 31 Name: d 5200-3, dtype: int64 0 49 1 21 Name: d 5200-4, dtype: int64 0 70 Name: d 5200-8, dtype: int64 0 70 Name: d 5200-9, dtype: int64 0 65 1 5 Name: d 5201-0, dtype: int64 0 57 1 13 Name: d 5201-1, dtype: int64 0 64 1 6 Name: d 5201-2, dtype: int64 0 50 1 20 Name: d 5201-3, dtype: int64 0 44 1 26 Name: d 5201-4, dtype: int64 0 70 Name: d 5201-8, dtype: int64 0 70 Name: d 5201-9, dtype: int64 0 65 1 5 Name: d 5202-0, dtype: int64 0 58 1 12 Name: d 5202-1, dtype: int64 0 63 1 7 Name: d 5202-2, dtype: int64 0 50 1 20 Name: d 5202-3, dtype: int64 0 44 1 26 Name: d 5202-4, dtype: int64 0 70 Name: d 5202-8, dtype: int64 0 70 Name: d 5202-9, dtype: int64 0 69 1 1 Name: d 5203-0, dtype: int64 0 66 1 4 Name: d 5203-1, dtype: int64 0 58 1 12 Name: d 5203-2, dtype: int64 0 53 1 17 Name: d 5203-3, dtype: int64 1 36 0 34 Name: d 5203-4, dtype: int64 0 70 Name: d 5203-8, dtype: int64 0 70 Name: d 5203-9, dtype: int64 0 69 1 1 Name: d 5204-0, dtype: int64 0 66 1 4 Name: d 5204-1, dtype: int64 0 58 1 12 Name: d 5204-2, dtype: int64 0 53 1 17 Name: d 5204-3, dtype: int64 1 36 0 34 Name: d 
5204-4, dtype: int64 0 70 Name: d 5204-8, dtype: int64 0 70 Name: d 5204-9, dtype: int64 0 65 1 5 Name: d 5205-0, dtype: int64 0 58 1 12 Name: d 5205-1, dtype: int64 0 62 1 8 Name: d 5205-2, dtype: int64 0 50 1 20 Name: d 5205-3, dtype: int64 0 45 1 25 Name: d 5205-4, dtype: int64 0 70 Name: d 5205-8, dtype: int64 0 70 Name: d 5205-9, dtype: int64 0 45 1 25 Name: d 53000-0, dtype: int64 0 43 1 27 Name: d 53000-1, dtype: int64 0 63 1 7 Name: d 53000-2, dtype: int64 0 69 1 1 Name: d 53000-3, dtype: int64 0 60 1 10 Name: d 53000-4, dtype: int64 0 70 Name: d 53000-8, dtype: int64 0 70 Name: d 53000-9, dtype: int64 0 68 1 2 Name: d 53001-0, dtype: int64 0 61 1 9 Name: d 53001-1, dtype: int64 0 50 1 20 Name: d 53001-2, dtype: int64 0 68 1 2 Name: d 53001-3, dtype: int64 1 37 0 33 Name: d 53001-4, dtype: int64 0 70 Name: d 53001-8, dtype: int64 0 70 Name: d 53001-9, dtype: int64 0 44 1 26 Name: d 53010-0, dtype: int64 0 43 1 27 Name: d 53010-1, dtype: int64 0 64 1 6 Name: d 53010-2, dtype: int64 0 69 1 1 Name: d 53010-3, dtype: int64 0 60 1 10 Name: d 53010-4, dtype: int64 0 70 Name: d 53010-8, dtype: int64 0 70 Name: d 53010-9, dtype: int64 0 69 1 1 Name: d 53011-0, dtype: int64 0 61 1 9 Name: d 53011-1, dtype: int64 0 49 1 21 Name: d 53011-2, dtype: int64 0 68 1 2 Name: d 53011-3, dtype: int64 1 37 0 33 Name: d 53011-4, dtype: int64 0 70 Name: d 53011-8, dtype: int64 0 70 Name: d 53011-9, dtype: int64 0 70 Name: d 5302-0, dtype: int64 0 70 Name: d 5302-1, dtype: int64 0 67 1 3 Name: d 5302-2, dtype: int64 0 66 1 4 Name: d 5302-3, dtype: int64 0 61 1 9 Name: d 5302-4, dtype: int64 0 70 Name: d 5302-8, dtype: int64 1 54 0 16 Name: d 5302-9, dtype: int64 0 64 1 6 Name: d 5400-0, dtype: int64 0 65 1 5 Name: d 5400-1, dtype: int64 0 54 1 16 Name: d 5400-2, dtype: int64 0 57 1 13 Name: d 5400-3, dtype: int64 0 40 1 30 Name: d 5400-4, dtype: int64 0 70 Name: d 5400-8, dtype: int64 0 70 Name: d 5400-9, dtype: int64 0 64 1 6 Name: d 5401-0, dtype: int64 0 64 1 6 Name: d 5401-1, 
dtype: int64 0 55 1 15 Name: d 5401-2, dtype: int64 0 55 1 15 Name: d 5401-3, dtype: int64 0 42 1 28 Name: d 5401-4, dtype: int64 0 70 Name: d 5401-8, dtype: int64 0 70 Name: d 5401-9, dtype: int64 0 64 1 6 Name: d 5402-0, dtype: int64 0 65 1 5 Name: d 5402-1, dtype: int64 0 54 1 16 Name: d 5402-2, dtype: int64 0 57 1 13 Name: d 5402-3, dtype: int64 0 40 1 30 Name: d 5402-4, dtype: int64 0 70 Name: d 5402-8, dtype: int64 0 70 Name: d 5402-9, dtype: int64 0 63 1 7 Name: d 5403-0, dtype: int64 0 62 1 8 Name: d 5403-1, dtype: int64 0 47 1 23 Name: d 5403-2, dtype: int64 0 66 1 4 Name: d 5403-3, dtype: int64 0 42 1 28 Name: d 5403-4, dtype: int64 0 70 Name: d 5403-8, dtype: int64 0 70 Name: d 5403-9, dtype: int64 0 53 1 17 Name: d 5404-0, dtype: int64 1 41 0 29 Name: d 5404-1, dtype: int64 0 66 1 4 Name: d 5404-2, dtype: int64 0 70 Name: d 5404-3, dtype: int64 0 62 1 8 Name: d 5404-4, dtype: int64 0 70 Name: d 5404-8, dtype: int64 0 70 Name: d 5404-9, dtype: int64 1 52 0 18 Name: d 5500-0, dtype: int64 0 56 1 14 Name: d 5500-1, dtype: int64 0 68 1 2 Name: d 5500-2, dtype: int64 0 70 Name: d 5500-3, dtype: int64 0 68 1 2 Name: d 5500-4, dtype: int64 0 70 Name: d 5500-8, dtype: int64 0 70 Name: d 5500-9, dtype: int64 0 61 1 9 Name: d 5501-0, dtype: int64 0 58 1 12 Name: d 5501-1, dtype: int64 0 62 1 8 Name: d 5501-2, dtype: int64 0 53 1 17 Name: d 5501-3, dtype: int64 0 46 1 24 Name: d 5501-4, dtype: int64 0 70 Name: d 5501-8, dtype: int64 0 70 Name: d 5501-9, dtype: int64 1 53 0 17 Name: d 5600-0, dtype: int64 0 57 1 13 Name: d 5600-1, dtype: int64 0 68 1 2 Name: d 5600-2, dtype: int64 0 70 Name: d 5600-3, dtype: int64 0 68 1 2 Name: d 5600-4, dtype: int64 0 70 Name: d 5600-8, dtype: int64 0 70 Name: d 5600-9, dtype: int64 0 59 1 11 Name: d 5602-0, dtype: int64 0 59 1 11 Name: d 5602-1, dtype: int64 0 54 1 16 Name: d 5602-2, dtype: int64 0 61 1 9 Name: d 5602-3, dtype: int64 0 47 1 23 Name: d 5602-4, dtype: int64 0 70 Name: d 5602-8, dtype: int64 0 70 Name: d 5602-9, 
dtype: int64 0 66 1 4 Name: d 5700-0, dtype: int64 0 49 1 21 Name: d 5700-1, dtype: int64 0 56 1 14 Name: d 5700-2, dtype: int64 0 48 1 22 Name: d 5700-3, dtype: int64 0 61 1 9 Name: d 5700-4, dtype: int64 0 70 Name: d 5700-8, dtype: int64 0 70 Name: d 5700-9, dtype: int64 0 64 1 6 Name: d 5701-0, dtype: int64 0 47 1 23 Name: d 5701-1, dtype: int64 0 54 1 16 Name: d 5701-2, dtype: int64 0 52 1 18 Name: d 5701-3, dtype: int64 0 63 1 7 Name: d 5701-4, dtype: int64 0 70 Name: d 5701-8, dtype: int64 0 70 Name: d 5701-9, dtype: int64 0 64 1 6 Name: d 57020-0, dtype: int64 0 37 1 33 Name: d 57020-1, dtype: int64 0 54 1 16 Name: d 57020-2, dtype: int64 0 58 1 12 Name: d 57020-3, dtype: int64 0 67 1 3 Name: d 57020-4, dtype: int64 0 70 Name: d 57020-8, dtype: int64 0 70 Name: d 57020-9, dtype: int64 0 58 1 12 Name: d 57021-0, dtype: int64 0 39 1 31 Name: d 57021-1, dtype: int64 0 53 1 17 Name: d 57021-2, dtype: int64 0 64 1 6 Name: d 57021-3, dtype: int64 0 66 1 4 Name: d 57021-4, dtype: int64 0 70 Name: d 57021-5, dtype: int64 0 70 Name: d 57021-6, dtype: int64 1 53 0 17 Name: d 57022-0, dtype: int64 0 56 1 14 Name: d 57022-1, dtype: int64 0 69 1 1 Name: d 57022-2, dtype: int64 0 69 1 1 Name: d 57022-3, dtype: int64 0 69 1 1 Name: d 57022-4, dtype: int64 0 70 Name: d 57022-8, dtype: int64 0 70 Name: d 57022-9, dtype: int64 0 62 1 8 Name: d 571-0, dtype: int64 0 47 1 23 Name: d 571-1, dtype: int64 0 48 1 22 Name: d 571-2, dtype: int64 0 59 1 11 Name: d 571-3, dtype: int64 0 64 1 6 Name: d 571-4, dtype: int64 0 70 Name: d 571-8, dtype: int64 0 70 Name: d 571-9, dtype: int64
# feature_engine supplies ready-made feature-selection transformers;
# Pipeline chains them with a final scaler.
from feature_engine.selection import DropConstantFeatures,DropDuplicateFeatures,SmartCorrelatedSelection,DropCorrelatedFeatures
from sklearn.pipeline import Pipeline
# Step 1: drop (quasi-)constant features (same value in >= 99% of rows).
# Step 2: drop exact duplicate columns.
# Step 3: drop one feature of each pair correlated above 0.5.
# Step 4: standardise whatever remains.
pipe1 = Pipeline([('constant',DropConstantFeatures(tol=0.99)),
('duplicate',DropDuplicateFeatures()),
('correlation',DropCorrelatedFeatures(threshold=0.5)),
('scaler',StandardScaler()),
])
pipe1.fit(X)
Pipeline(steps=[('constant', DropConstantFeatures(tol=0.99)),
('duplicate', DropDuplicateFeatures()),
('correlation', DropCorrelatedFeatures(threshold=0.5)),
('scaler', StandardScaler())])
print(f"Constant or Quassi constant features to drop:-{len(pipe1.named_steps.constant.features_to_drop_)}")
# pipe1.named_steps.constant.features_to_drop_
Constant or Quassi constant features to drop:-63
print(f"Duplicate features to drop:-{len(pipe1.named_steps.duplicate.features_to_drop_)}")
Duplicate features to drop:-27
print(f"Correlated features to drop:-{len(pipe1.named_steps.correlation.features_to_drop_)}")
pipe1.named_steps.correlation.features_to_drop_
Correlated features to drop:-70
{'d 5101-1',
'd 5101-4',
'd 5102-0',
'd 5102-1',
'd 5102-2',
'd 5102-3',
'd 5102-4',
'd 5200-0',
'd 5200-1',
'd 5200-4',
'd 5201-0',
'd 5201-1',
'd 5201-2',
'd 5201-3',
'd 5201-4',
'd 5202-1',
'd 5203-3',
'd 5203-4',
'd 5205-2',
'd 5205-4',
'd 53000-1',
'd 53001-1',
'd 53001-2',
'd 53001-4',
'd 53010-0',
'd 53010-1',
'd 53010-2',
'd 53011-0',
'd 53011-1',
'd 53011-2',
'd 5302-9',
'd 5400-0',
'd 5400-1',
'd 5400-2',
'd 5400-3',
'd 5400-4',
'd 5401-1',
'd 5401-2',
'd 5401-3',
'd 5401-4',
'd 5403-0',
'd 5403-1',
'd 5403-2',
'd 5403-3',
'd 5404-0',
'd 5500-1',
'd 5500-2',
'd 5501-0',
'd 5501-3',
'd 5501-4',
'd 5600-0',
'd 5600-1',
'd 5602-0',
'd 5602-1',
'd 5602-2',
'd 5602-4',
'd 5701-0',
'd 5701-1',
'd 5701-3',
'd 5701-4',
'd 57020-2',
'd 57020-4',
'd 57021-0',
'd 57021-1',
'd 57021-3',
'd 57021-4',
'd 57022-1',
'd 571-0',
'd 571-1',
'd 571-4'}
print(f"Before applying techniques used above to reduce dimensionality,\nshape :{X.shape}")
Before applying techniques used above to reduce dimensionality, shape :(70, 203)
# Apply the fitted pipeline: drop constant/duplicate/correlated features, then scale.
# Note: transform returns a NumPy array, so the original column names are lost here.
X_reduce = pd.DataFrame(pipe1.transform(X))
print(f"After applying techniques used above to reduce dimensionality,\nshape :{X_reduce.shape}")
After applying techniques used above to reduce dimensionality, shape :(70, 43)
So, By looking constant variance, correlated features, we are able to reduce the dimensionality to 43 prior to 203
Now the dataset will be evaluated using feature extraction technique called PCA to further reduce dimensionality
# Fit a full PCA on the reduced feature set to examine the variance spectrum.
pca = PCA()
pca.fit(X_reduce)

# 1-based component indices for plotting.
component_index = [i + 1 for i in range(len(pca.explained_variance_ratio_))]

# Per-component explained variance ratio.
fig = px.bar(
    x=component_index,
    y=pca.explained_variance_ratio_,
    labels={"x": "Principal Components", "y": "Explained variance ratio"},
)
fig.show()

# Cumulative explained variance: used to decide how many components to keep.
fig = px.line(
    x=component_index,
    y=pca.explained_variance_ratio_.cumsum(),
    title='Explained Variance Ratio for \nfitted components',
    labels={"x": "Number of Components", "y": "Cumalative Explained Variance Ratio"},
    height=500,
    width=1000,
    markers=True,
)
fig.show()
It is recommended that approx 80% variance of data should be preserved and from above line plot , 15 PCA components are able to capture that much variance
So we reduce dimensionality further by projecting the above reduced 43 features into 15 Principal components
# Project the 43 surviving features onto the 15 leading principal components
# (enough to retain roughly 80% of the variance), then redo the elbow analysis
# on the projected data.
pca_15 = PCA(n_components=15, random_state=42)
pca_15_transform_df = pca_15.fit_transform(X_reduce)

wcss = []
values = range(2,10)
li = []
for n in values:
    model = KMeans(n_clusters=n, random_state=42)
    model.fit(pca_15_transform_df)
    wcss.append(model.inertia_)
    li.append(f"For n_clusters = {n},inertia :{model.inertia_}")

fig = px.line(
    x=values,
    y=wcss,
    title='Distortions vs number of clusters',
    labels={"x": "Number_of_clusters", "y": "Distortions"},
    height=500,
    width=900,
    markers=True,
)
fig.show()
# Silhouette analysis repeated on the PCA-projected data.
sl = []
for n in values:
    clusterer = KMeans(n_clusters=n, random_state=42)
    assignments = clusterer.fit_predict(pca_15_transform_df)
    sl.append(silhouette_score(pca_15_transform_df, assignments, random_state=1))

fig = px.line(
    x=values,
    y=sl,
    title='silhouette score',
    labels={"x": "number_of_clusters", "y": "silhouette score"},
    height=500,
    width=1000,
    text=values,
)
fig.show()
From the analysis of the graphs above, we can see that after reducing our features from 203 to just 15 we get the same result of 7 clusters.
Here we lost about 20% of the variance and still obtained a decent result.
Now KMeans is applied on the reduced feature dataset to find the optimal number of segments/clusters
# Final model: K-Means with the 7 clusters suggested by both the elbow and
# silhouette analyses, fitted on the 15-component PCA projection.
# random_state is pinned so the labels are reproducible, consistent with
# every other KMeans fit in this notebook (the original omitted it here).
kmean = KMeans(n_clusters=7, random_state=42)
labels = kmean.fit_predict(pca_15_transform_df)
# Cluster assignment (0-6) for each of the 70 children.
labels
array([2, 2, 5, 2, 2, 4, 2, 1, 5, 4, 5, 4, 5, 4, 5, 0, 3, 0, 3, 5, 2, 4,
1, 2, 2, 5, 4, 0, 4, 2, 4, 4, 6, 0, 0, 2, 4, 5, 2, 2, 5, 1, 3, 4,
3, 5, 3, 1, 2, 3, 3, 5, 6, 2, 2, 1, 3, 0, 2, 2, 2, 3, 5, 4, 1, 3,
3, 0, 0, 0])
# Visualise the clusters in the plane of the first two principal components,
# colouring each point by its K-Means label.
x = pca_15_transform_df[:, 0]
y = pca_15_transform_df[:, 1]
sns.scatterplot(x=x, y=y, hue=labels, palette='Set1')
plt.xlabel('PC1')
plt.ylabel('PC2')
Text(0, 0.5, 'PC2')
From the above plot and analysis, using various feature selection and feature elimination techniques we were able to mitigate the curse of dimensionality, reducing the feature space from 203 features to just 15, while still segmenting the dataset into 7 clusters/classes using K-means.
The percentage of variance is computed using the explained_variance_ratio_ attribute of the fitted PCA object.
# Percentage of variance captured by leading principal components.
# Fixes vs. the original cell: (1) "Precentage" typo; (2) each message now
# agrees with the slice it reports ("first 2" used [:3], "first 70" used all
# components of a 43-feature PCA); (3) round AFTER converting to percent so
# the output prints cleanly instead of e.g. 28.999999999999996%.
print(f"Percentage of variance explained by all {len(pca.explained_variance_ratio_)} components:{round(pca.explained_variance_ratio_.sum() * 100, 2)}%")
print(f"Percentage of variance explained by first 16 components:{round(pca.explained_variance_ratio_[:16].sum() * 100, 2)}%")
print(f"Percentage of variance explained by first 3 components:{round(pca.explained_variance_ratio_[:3].sum() * 100, 2)}%")
Precentage of variance explained by first 70 components:100.0% Precentage of variance explained by first 16 components:79.0% Precentage of variance explained by first 2 components:28.999999999999996%
This dataset includes data for the estimation of obesity levels in individuals based on their eating habits and physical condition. The data contains 17 attributes and 2111 records
a. What model have you selected for solving this problem and why?
b. Have you made any assumption for the target variable? If so, then why?
c. What have you done with text variables? Explain.
d. Have you optimized any model parameters? What is the benefit of this action?
e. Have you applied any steps for handling overfitting or underfitting issues? What is that?
In this problem, the aim is to predict the weight of a person from certain predictor variables. This makes it a regression problem, so we will apply various supervised regression algorithms and select the final model based on model performance.
# Importing the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the obesity dataset: 2111 records with 17 attributes describing
# eating habits and physical condition.
df = pd.read_csv('ObesityDataSet.csv')
# Preview the first five rows.
df.head()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 21.0 | 1.62 | 64.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 1.0 | no | Public_Transportation | Normal_Weight |
| 1 | Female | 21.0 | 1.52 | 56.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 3.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | Male | 23.0 | 1.80 | 77.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 1.0 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Male | 27.0 | 1.80 | 87.0 | no | no | 3.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Frequently | Walking | Overweight_Level_I |
| 4 | Male | 22.0 | 1.78 | 89.8 | no | no | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_II |
df.shape
(2111, 17)
df.isna().sum()
Gender 0 Age 0 Height 0 Weight 0 family_history_with_overweight 0 FAVC 0 FCVC 0 NCP 0 CAEC 0 SMOKE 0 CH2O 0 SCC 0 FAF 0 TUE 0 CALC 0 MTRANS 0 NObeyesdad 0 dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 2111 non-null object 1 Age 2111 non-null float64 2 Height 2111 non-null float64 3 Weight 2111 non-null float64 4 family_history_with_overweight 2111 non-null object 5 FAVC 2111 non-null object 6 FCVC 2111 non-null float64 7 NCP 2111 non-null float64 8 CAEC 2111 non-null object 9 SMOKE 2111 non-null object 10 CH2O 2111 non-null float64 11 SCC 2111 non-null object 12 FAF 2111 non-null float64 13 TUE 2111 non-null float64 14 CALC 2111 non-null object 15 MTRANS 2111 non-null object 16 NObeyesdad 2111 non-null object dtypes: float64(8), object(9) memory usage: 280.5+ KB
sns.displot(df['Age'],bins = 20)
C:\Users\harpr\anaconda3\lib\site-packages\seaborn\axisgrid.py:88: UserWarning: The figure layout has changed to tight
<seaborn.axisgrid.FacetGrid at 0x2b9267127c0>
The Target variable is Weight and it is correlated with the predictor variables. So Linear regression is suitable to be applied on this first
sns.scatterplot(y=df['Weight'],x = df['Height'],)
<Axes: xlabel='Height', ylabel='Weight'>
From above plot , It seems that Weight(Target) and Height(Predictor) are correlated to each other
Further assumption to be considered:--
The observations are independent of each other.
The errors follow a normal distribution.
The independent variables are not highly correlated with each other.
Homoscedasticity: The variance of the errors is constant across all levels of the independent variables
df['Weight'].corr(df['Height'])
0.4631361166156269
# Correlation heatmap of the numeric columns. numeric_only=True is required
# because df still contains object (text) columns at this point, and
# DataFrame.corr() raises on non-numeric data under pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True), annot=True)
<Axes: >
df.columns
Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
'CALC', 'MTRANS', 'NObeyesdad'],
dtype='object')
# import plotly.express as px
# # df = px.data.tips()
# plt.figure(figsize=(12,6))
# fig = px.histogram(df, x="Weight", color="Gender", marginal="rug")
# fig.show()
# Drop the obesity-level class label before regression on Weight.
# NOTE(review): NObeyesdad encodes an obesity category that is presumably
# derived from Weight/Height, so keeping it would leak the target — confirm.
df = df.drop(columns = ['NObeyesdad'])
df.head()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 21.0 | 1.62 | 64.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 1.0 | no | Public_Transportation |
| 1 | Female | 21.0 | 1.52 | 56.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 3.0 | 0.0 | Sometimes | Public_Transportation |
| 2 | Male | 23.0 | 1.80 | 77.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 1.0 | Frequently | Public_Transportation |
| 3 | Male | 27.0 | 1.80 | 87.0 | no | no | 3.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Frequently | Walking |
| 4 | Male | 22.0 | 1.78 | 89.8 | no | no | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation |
The columns which are not numerical will be converted to numeric columns using the pd.get_dummies function so that the machine learning algorithms can handle those variables.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 2111 non-null object 1 Age 2111 non-null float64 2 Height 2111 non-null float64 3 Weight 2111 non-null float64 4 family_history_with_overweight 2111 non-null object 5 FAVC 2111 non-null object 6 FCVC 2111 non-null float64 7 NCP 2111 non-null float64 8 CAEC 2111 non-null object 9 SMOKE 2111 non-null object 10 CH2O 2111 non-null float64 11 SCC 2111 non-null object 12 FAF 2111 non-null float64 13 TUE 2111 non-null float64 14 CALC 2111 non-null object 15 MTRANS 2111 non-null object dtypes: float64(8), object(8) memory usage: 264.0+ KB
print(f"Text variable columns :\n{list(df.select_dtypes(include=['object','category']).columns)}")
Text variable columns : ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
# One-hot encode the text columns; drop_first=True removes one dummy per
# categorical variable to avoid perfect multicollinearity among the dummies.
df_converted = pd.get_dummies(df,drop_first=True)
df_converted.head()
| Age | Height | Weight | FCVC | NCP | CH2O | FAF | TUE | Gender_Male | family_history_with_overweight_yes | ... | CAEC_no | SMOKE_yes | SCC_yes | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21.0 | 1.62 | 64.0 | 2.0 | 3.0 | 2.0 | 0.0 | 1.0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 21.0 | 1.52 | 56.0 | 3.0 | 3.0 | 3.0 | 3.0 | 0.0 | 0 | 1 | ... | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 23.0 | 1.80 | 77.0 | 2.0 | 3.0 | 2.0 | 2.0 | 1.0 | 1 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 27.0 | 1.80 | 87.0 | 3.0 | 3.0 | 2.0 | 2.0 | 0.0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 22.0 | 1.78 | 89.8 | 2.0 | 1.0 | 2.0 | 0.0 | 0.0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
5 rows × 23 columns
df_converted.shape
(2111, 23)
# Preprocessing the data to create models
# Predictors: every encoded column except the target.
X = df_converted.drop(columns = ['Weight'])
# Target: continuous body weight (regression).
y = df_converted['Weight']
from sklearn.model_selection import train_test_split
# Hold out 33% of the records for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
# Creating a function to create a model and report performance metrics.
# Module-level accumulators: one entry per trained model, consumed later when
# the results are assembled into a summary DataFrame.
Name = []
training_score = []
testing_score = []
Mean_absolute_error = []
Root_Mean_absolute_error = []  # NOTE(review): despite the name this stores RMSE;
                               # kept as-is because later cells read this name.


def create_model(model_name, model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split, print and record its metrics.

    Works with plain estimators and GridSearchCV wrappers alike (anything
    exposing fit/score/predict). Records train/test R^2 score, MAE and RMSE
    in the module-level lists and returns the fitted model.
    """
    print(f"Using {model_name}")
    model.fit(X_train, y_train)

    # Compute each value exactly once (the original re-ran score() and the
    # metric functions a second time when appending to the lists).
    train_score = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    test_score = model.score(X_test, y_test)
    mae = float(np.mean(np.abs(y_test - y_pred)))
    # RMSE computed directly: mean_squared_error(..., squared=False) relies on
    # the `squared` argument, which is deprecated and removed in recent
    # scikit-learn releases.
    rmse = float(np.sqrt(np.mean((y_test - y_pred) ** 2)))

    print(f"Model score on training Data:{train_score}")
    print(f"Model score on testing Data:{test_score}")
    print(f"Mean_absolute_error: {mae}")
    print(f"Root_Mean_squared_error: {rmse}")

    Name.append(model_name)
    training_score.append(train_score)
    testing_score.append(test_score)
    Mean_absolute_error.append(mae)
    Root_Mean_absolute_error.append(rmse)
    return model
# Standardize features: learn the scaling statistics from the training split
# only, then apply the identical transform to the test split (no leakage).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Here, we are going to create various regression models with their parameters tuned/optimized for better results. This should help in achieving good performance on the testing data.
# Baseline model: ordinary least squares, no hyperparameters to tune.
lr = LinearRegression()
model = create_model(
    'Linear_Regression', lr, X_train_scaled, y_train, X_test_scaled, y_test
)
Using Linear_Regression Model score on training Data:0.5808885596803982 Model score on testing Data:0.5828135132679231 Mean_absolute_error: 13.653608738653872 Root_Mean_squared_error: 17.155531640918152
# Lasso (L1-regularized) regression: the penalty shrinks coefficients, which
# helps control overfitting; alpha is tuned by cross-validated grid search.
ls = Lasso()
param = {'alpha': [0.1, 0.5, 1]}
# Plain KFold is the right splitter for regression (the original misleadingly
# called it `skf`); random_state fixed so the shuffled folds — and therefore
# the CV scores — are reproducible across runs.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(ls, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model('lasso', grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using lasso
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Model score on training Data:0.5802176604982587
Model score on testing Data:0.5822939401233771
Mean_absolute_error: 13.710866378457993
Root_Mean_squared_error: 17.16621125333108
Best parameters for model:{'alpha': 0.1}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 0 | {'alpha': 0.1} | 0.558384 | 0.581642 |
| 1 | {'alpha': 0.5} | 0.554537 | 0.574481 |
| 2 | {'alpha': 1} | 0.543102 | 0.560802 |
# ElasticNet blends L1 and L2 penalties; tune the overall penalty strength.
el = ElasticNet()
param = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(el, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model("ElasticNet", grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using ElasticNet
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model score on training Data:0.5784871951019938
Model score on testing Data:0.5806723262284612
Mean_absolute_error: 13.76348204557237
Root_Mean_squared_error: 17.199500214860194
Best parameters for model:{'alpha': 0.1}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 0 | {'alpha': 0.1} | 0.565800 | 0.579951 |
| 1 | {'alpha': 0.2} | 0.562454 | 0.574946 |
| 2 | {'alpha': 0.3} | 0.557300 | 0.568649 |
| 3 | {'alpha': 0.4} | 0.551693 | 0.561954 |
| 4 | {'alpha': 0.5} | 0.545924 | 0.555166 |
# Sweep the maximum tree depth to visualize the bias/variance trade-off of a
# random forest on this data (train vs. test score curves).
test = []
train = []
r = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
for t in r:
    # Seeded so the plotted curves are reproducible; the original also made an
    # unused predict() call (dead `y_pred`) and built a throwaway unfitted
    # forest before the loop — both removed.
    rf = RandomForestRegressor(max_depth=t, random_state=42)
    rf.fit(X_train_scaled, y_train)
    train.append(rf.score(X_train_scaled, y_train))
    test.append(rf.score(X_test_scaled, y_test))
plt.plot(r, train, label='train')
plt.plot(r, test, label='test')
plt.ylabel('score')
plt.xlabel('depth of tree')
plt.legend()
<matplotlib.legend.Legend at 0x2b9276878b0>
# Grid search over the random forest; `rf` (from the depth sweep above) is the
# base estimator that GridSearchCV clones for every parameter combination.
# NOTE: max_features="auto" was removed for regressors in scikit-learn 1.3;
# 1.0 (a fraction meaning "use all features") is the equivalent spelling.
param = {'n_estimators': [100, 200, 250, 300],
         'max_features': [1.0, "sqrt", "log2"],
         'max_depth': [5, 6, 7, 8, 9]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(rf, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model("Random_forest", grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using Random_forest
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Model score on training Data:0.9614081298279076
Model score on testing Data:0.8663450417519792
Mean_absolute_error: 6.213822864175021
Root_Mean_squared_error: 9.710276624135686
Best parameters for model:{'max_depth': 9, 'max_features': 'auto', 'n_estimators': 100}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 48 | {'max_depth': 9, 'max_features': 'auto', 'n_es... | 0.859221 | 0.963668 |
| 51 | {'max_depth': 9, 'max_features': 'auto', 'n_es... | 0.859104 | 0.963896 |
| 50 | {'max_depth': 9, 'max_features': 'auto', 'n_es... | 0.857884 | 0.963792 |
| 57 | {'max_depth': 9, 'max_features': 'log2', 'n_es... | 0.856880 | 0.930614 |
| 49 | {'max_depth': 9, 'max_features': 'auto', 'n_es... | 0.856354 | 0.963730 |
# XGBoost regressor: jointly tune tree count, learning rate and tree depth.
# (A commented-out max_features line was removed — XGBRegressor has no such
# parameter.)
xg = XGBRegressor()
param = {'n_estimators': [100, 200, 250],
         'learning_rate': [0.1, 0.05, 0.2],
         'max_depth': [5, 6, 7, 8, 9]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(xg, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model('XGBoost', grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using XGBoost
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Model score on training Data:0.9981127252125205
Model score on testing Data:0.8725395224306163
Mean_absolute_error: 5.808542445728975
Root_Mean_squared_error: 9.48258699213045
Best parameters for model:{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 7 | {'learning_rate': 0.1, 'max_depth': 7, 'n_esti... | 0.890363 | 0.998714 |
| 8 | {'learning_rate': 0.1, 'max_depth': 7, 'n_esti... | 0.890311 | 0.999360 |
| 26 | {'learning_rate': 0.05, 'max_depth': 8, 'n_est... | 0.890064 | 0.998198 |
| 25 | {'learning_rate': 0.05, 'max_depth': 8, 'n_est... | 0.889516 | 0.996847 |
| 10 | {'learning_rate': 0.1, 'max_depth': 8, 'n_esti... | 0.888763 | 0.999701 |
# Gradient boosting regressor: tune tree count, learning rate and depth.
# (A commented-out alternate max_depth grid was removed as dead code.)
gd = GradientBoostingRegressor()
param = {'n_estimators': [100, 200, 250, 300, 400],
         'learning_rate': [0.1, 0.05, 0.2],
         'max_depth': [5, 6, 7, 8, 9]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(gd, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model('GradientBoostingRegressor', grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using GradientBoostingRegressor
Fitting 5 folds for each of 75 candidates, totalling 375 fits
Model score on training Data:0.9996550407663248
Model score on testing Data:0.8683131457166835
Mean_absolute_error: 5.656004997996309
Root_Mean_squared_error: 9.638518307154278
Best parameters for model:{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 250}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 12 | {'learning_rate': 0.1, 'max_depth': 7, 'n_esti... | 0.884547 | 0.999837 |
| 13 | {'learning_rate': 0.1, 'max_depth': 7, 'n_esti... | 0.882983 | 0.999930 |
| 37 | {'learning_rate': 0.05, 'max_depth': 7, 'n_est... | 0.882310 | 0.998056 |
| 11 | {'learning_rate': 0.1, 'max_depth': 7, 'n_esti... | 0.881764 | 0.999602 |
| 34 | {'learning_rate': 0.05, 'max_depth': 6, 'n_est... | 0.881657 | 0.997535 |
# AdaBoost regressor: tune the number of weak learners and the learning rate.
ada = AdaBoostRegressor()
param = {'n_estimators': [50, 100, 200, 300, 400],
         'learning_rate': [0.005, 0.001, 0.01, 0.1, 1.0]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(ada, param_grid=param, cv=cv, refit=True, verbose=1,
                    return_train_score=True)
model = create_model('AdaBoosting', grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
result = pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')
result[['params', 'mean_test_score', 'mean_train_score']].head()
Using AdaBoosting
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Model score on training Data:0.722815697590329
Model score on testing Data:0.6804841855699005
Mean_absolute_error: 12.624360221520202
Root_Mean_squared_error: 15.013613935246953
Best parameters for model:{'learning_rate': 1.0, 'n_estimators': 50}
| params | mean_test_score | mean_train_score | |
|---|---|---|---|
| 20 | {'learning_rate': 1.0, 'n_estimators': 50} | 0.697331 | 0.723666 |
| 19 | {'learning_rate': 0.1, 'n_estimators': 400} | 0.695563 | 0.732502 |
| 22 | {'learning_rate': 1.0, 'n_estimators': 200} | 0.695172 | 0.730378 |
| 18 | {'learning_rate': 0.1, 'n_estimators': 300} | 0.692304 | 0.730851 |
| 21 | {'learning_rate': 1.0, 'n_estimators': 100} | 0.691165 | 0.725933 |
# Collect the per-model metrics into a summary table, best test score first.
# Uses a dedicated name: the original rebound `df`, clobbering the variable
# that until here referred to the raw dataset.
results_df = pd.DataFrame({'Name': Name,
                           'Training_Score': training_score,
                           'Testing_score': testing_score,
                           'Mean_absolute_error': Mean_absolute_error,
                           'Root_Mean_absolute_error': Root_Mean_absolute_error})
results_df.sort_values(by=['Testing_score'], ascending=False)
| Name | Training_Score | Testing_score | Mean_absolute_error | Root_Mean_absolute_error | |
|---|---|---|---|---|---|
| 4 | XGBoost | 0.998113 | 0.872540 | 5.808542 | 9.482587 |
| 5 | GradientBoostingRegressor | 0.999655 | 0.868313 | 5.656005 | 9.638518 |
| 3 | Random_forest | 0.961408 | 0.866345 | 6.213823 | 9.710277 |
| 6 | AdaBoosting | 0.722816 | 0.680484 | 12.624360 | 15.013614 |
| 0 | Linear_Regression | 0.580889 | 0.582814 | 13.653609 | 17.155532 |
| 1 | lasso | 0.580218 | 0.582294 | 13.710866 | 17.166211 |
| 2 | ElasticNet | 0.578487 | 0.580672 | 13.763482 | 17.199500 |
e. Have you applied any steps for handling overfitting or underfitting issues? What is that?
In order to handle overfitting/underfitting while developing the models, several techniques were applied to tackle this issue:
Regularized linear models — Lasso (L1) and ElasticNet (L1+L2) — were used above alongside plain Linear Regression; their penalty terms shrink coefficients and reduce overfitting.
Cross-validated hyperparameter tuning (GridSearchCV with K-fold CV) was used for all of the models trained above.
# NOTE(review): this cell is an exact duplicate of the summary-table cell
# above, and it rebinds `df` — which previously held the raw dataset — to the
# metrics table. Kept for the narrative answer that follows.
df = pd.DataFrame({'Name':Name,'Training_Score':training_score,'Testing_score':testing_score,
'Mean_absolute_error':Mean_absolute_error,
'Root_Mean_absolute_error':Root_Mean_absolute_error})
df.sort_values(by = ['Testing_score'],ascending=False)
| Name | Training_Score | Testing_score | Mean_absolute_error | Root_Mean_absolute_error | |
|---|---|---|---|---|---|
| 4 | XGBoost | 0.998113 | 0.872540 | 5.808542 | 9.482587 |
| 5 | GradientBoostingRegressor | 0.999655 | 0.868313 | 5.656005 | 9.638518 |
| 3 | Random_forest | 0.961408 | 0.866345 | 6.213823 | 9.710277 |
| 6 | AdaBoosting | 0.722816 | 0.680484 | 12.624360 | 15.013614 |
| 0 | Linear_Regression | 0.580889 | 0.582814 | 13.653609 | 17.155532 |
| 1 | lasso | 0.580218 | 0.582294 | 13.710866 | 17.166211 |
| 2 | ElasticNet | 0.578487 | 0.580672 | 13.763482 | 17.199500 |
From the above dataframe, we can see:
Linear Regression, Lasso and ElasticNet performed poorly
The AdaBoost regressor did a moderate job
XGBoost, the Gradient Boosting Regressor and Random Forest performed decently
However, XGBoost and Gradient Boosting overfit the data, as their training scores are
almost 100%
Random Forest can therefore be chosen as the final model, since it performs well on both the
training and the testing data
# Final model: a wider grid search over the random forest, including the
# leaf/split-size parameters that directly limit overfitting.
rf = RandomForestRegressor(random_state=42)
# NOTE: max_features="auto" was removed for regressors in scikit-learn 1.3;
# 1.0 (use all features) is the equivalent, supported spelling.
param = {'n_estimators': [50, 60, 70, 80, 90, 100],
         'min_samples_leaf': [1, 2, 3, 4],
         'min_samples_split': [2, 3, 4],
         'max_features': [1.0, "sqrt", "log2"],
         'max_depth': [3, 4, 5, 6, 7, 8, 9]}
# Seeded KFold so the shuffled CV folds are reproducible run-to-run.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(rf, param_grid=param, cv=cv, refit=True, verbose=1)
model = create_model("Random_forest", grid, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"Best parameters for model:{model.best_params_}")
Using Random_forest
Fitting 5 folds for each of 1512 candidates, totalling 7560 fits
Model score on training Data:0.9605529166947031
Model score on testing Data:0.8645090332782871
Mean_absolute_error: 6.340485191965618
Root_Mean_squared_error: 9.776743822014172
Best parameters for model:{'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 90}
The Random Forest model was further optimized over various parameters such as n_estimators, max_depth, max_features, min_samples_leaf and min_samples_split to handle overfitting as well as underfitting.
The model now shows a decent score on both the training and the testing data.
https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/#5
https://www.analyticsvidhya.com/blog/2022/02/a-comprehensive-guide-on-hyperparameter-tuning-and-its-techniques/
https://hastie.su.domains/ISLP/ISLP_website.pdf
https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/
https://www.bioinformatics.babraham.ac.uk/training/10XRNASeq/Dimension%20Reduction.pdf
http://theprofessionalspoint.blogspot.com/2019/03/advantages-and-disadvantages-of-t-sne.html
https://www.educba.com/density-based-clustering/
https://plotly.com/graphing-libraries/
https://seaborn.pydata.org/index.html
https://pypi.org/project/feature-engine/
https://www.datacamp.com/blog